Open In Colab

Validation¶

Validation of algorithms and transformation results.

Chap 7. Validação

  • Section 7.3 Resultados
    • Section 7.3.2 Semantic Annotation
      • Classificação de regras operativas, fatos, termos e nomes
    • Section 7.3.3 nlp2sbvr

Google colab¶

InĀ [1]:
%load_ext autoreload
%autoreload 2

import sys

IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
  from google.colab import drive
  drive.mount('/content/drive')
  !rm -rf cfr2sbvr configuration checkpoint
  !git clone https://github.com/asantos2000/master-degree-santos-anderson.git cfr2sbvr
  %pip install -r cfr2sbvr/code/requirements.txt
  !cp -r cfr2sbvr/code/src/configuration .
  !cp -r cfr2sbvr/code/src/checkpoint .
  !cp -r cfr2sbvr/code/config.colab.yaml config.yaml
  DEFAULT_CONFIG_FILE="config.yaml"
else:
  DEFAULT_CONFIG_FILE="../config.yaml"

Imports¶

InĀ [2]:
# Standard library imports
import json
import os
import time
from datetime import datetime
from typing import List

# Third-party imports
import logging_setup.main as logging_setup
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as mi
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import rules_taxonomy_provider.main as rules_taxonomy_provider
import scipy.stats as stats
from scipy.spatial.distance import cosine
from scipy.stats import kendalltau, spearmanr, pearsonr, linregress
from openai import OpenAI
from pydantic import BaseModel, Field

# Local modules
import configuration.main as configuration
import checkpoint.main as checkpoint
from checkpoint.main import (
  Document,
  DocumentProcessor,
  get_all_checkpoints,
  restore_checkpoint,
  save_checkpoint,
)
import llm_query.main as llm_query
from llm_query.main import query_instruct_llm
from rules_taxonomy_provider.main import RulesTemplateProvider

DEV_MODE = True

if DEV_MODE:
    # Development mode
    import importlib

    importlib.reload(configuration)
    importlib.reload(logging_setup)
    importlib.reload(checkpoint)
    importlib.reload(llm_query)
    importlib.reload(rules_taxonomy_provider)

# Ensure plots are displayed inline if using a Jupyter notebook
%matplotlib inline

Settings¶

Configuration¶

InĀ [3]:
# Load configuration
config = configuration.load_config(DEFAULT_CONFIG_FILE)

Logging¶

InĀ [4]:
logger = logging_setup.setting_logging(config["DEFAULT_LOG_DIR"], config["LOG_LEVEL"])
2024-12-15 01:44:30 - INFO - Logging is set up with daily rotation.

Checkpoints¶

Restore the checkpoint¶

InĀ [5]:
# Restore the checkpoint

# To run after extraction
last_checkpoint = configuration.get_last_filename(
    config["DEFAULT_CHECKPOINT_DIR"], "documents", "json"
)

logger.info(f"{last_checkpoint=}")

config["DEFAULT_CHECKPOINT_FILE"] = last_checkpoint

manager = restore_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"])
2024-12-15 01:44:30 - INFO - last_checkpoint='../data/checkpoints/documents-2024-12-08-10.json'
2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-10.json
2024-12-15 01:44:31 - INFO - Checkpoint restored from ../data/checkpoints/documents-2024-12-08-10.json.

General functions¶

InĀ [6]:
# Summary statistics
def summary_statistics(df):
    return df.describe()

# Token usage analysis
def token_usage_analysis(df):
    plt.figure(figsize=(10, 6))
    sns.histplot(df['total_tokens'], kde=True, bins=30)
    plt.title('Distribution of Total Tokens')
    plt.xlabel('Total Tokens')
    plt.ylabel('Frequency')
    plt.show()

    plt.figure(figsize=(10, 6))
    sns.boxplot(x='doc_type', y='total_tokens', data=df)
    plt.title('Total Tokens by Document Type')
    plt.xlabel('Document Type')
    plt.ylabel('Total Tokens')
    plt.xticks(rotation=45)
    plt.show()

# Time efficiency analysis
def time_efficiency_analysis(df):
    plt.figure(figsize=(10, 6))
    sns.histplot(df['elapsed_time'], kde=True, bins=30)
    plt.title('Distribution of Elapsed Time')
    plt.xlabel('Elapsed Time (seconds)')
    plt.ylabel('Frequency')
    plt.show()

    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='elapsed_time', y='tokens_per_second', data=df)
    plt.title('Tokens per Second vs Elapsed Time')
    plt.xlabel('Elapsed Time (seconds)')
    plt.ylabel('Tokens per Second')
    plt.show()

# Cost analysis
def cost_analysis(df):
    df['cost'] = (df['total_tokens'] / 1_000_000) * df['price_per_million_tokens']
    logger.info(f"Total cost: ${df['cost'].sum():.2f}")

    plt.figure(figsize=(10, 6))
    sns.histplot(df['cost'], kde=True, bins=30)
    plt.title('Distribution of Execution Cost')
    plt.xlabel('Cost ($)')
    plt.ylabel('Frequency')
    plt.show()

# Temporal trends analysis
def temporal_analysis(df):
    df['created_date'] = df['created'].dt.date
    daily_usage = df.groupby('created_date').size()

    plt.figure(figsize=(10, 6))
    daily_usage.plot()
    plt.title('Daily Prompt Executions')
    plt.xlabel('Date')
    plt.ylabel('Number of Executions')
    plt.show()

# Group performance comparison
def group_performance_analysis(df):
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='model', y='elapsed_time', data=df)
    plt.title('Elapsed Time by Model')
    plt.xlabel('Model')
    plt.ylabel('Elapsed Time (seconds)')
    plt.xticks(rotation=45)
    plt.show()
InĀ [7]:
# Function to plot histogram for semscore and similarity_score side-by-side
def plot_histogram_side_by_side(df, title, xlabel, output_dir, filename):
    plt.figure(figsize=(12, 6))
    plt.hist(df["semscore"], bins=20, color="#D55E00", alpha=0.7, label="Semscore", linestyle="--", edgecolor="black")
    plt.hist(df["similarity_score"], bins=20, color="#0072B2", alpha=0.7, label="Similarity Score", linestyle="-", edgecolor="black")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    plt.legend()
    plt.grid(True)
    plt.show()

    plot_path = os.path.join(output_dir, filename)
    plt.savefig(plot_path)
    plt.close()
    return plot_path

# Function to plot box plot for semscore and similarity_score side-by-side
def plot_boxplot_side_by_side(df, title, ylabel, output_dir, filename):
    plt.figure(figsize=(8, 6))
    boxplot = plt.boxplot(
        [df["semscore"].dropna(), df["similarity_score"].dropna()],
        labels=["Semscore", "Similarity Score"],
        patch_artist=True,
        boxprops=dict(color="black"),
        medianprops=dict(color="black"),
        capprops=dict(color="black"),
        whiskerprops=dict(color="black"),
    )
    colors = ["#D55E00", "#0072B2"]
    for patch, color in zip(boxplot['boxes'], colors):
        patch.set_facecolor(color)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.show()

    plot_path = os.path.join(output_dir, filename)
    plt.savefig(plot_path)
    plt.close()
    return plot_path

# Function to plot density plot for semscore and similarity_score side-by-side
def plot_density_side_by_side(df, title, xlabel, output_dir, filename):
    plt.figure(figsize=(12, 6))
    df["semscore"].plot(kind="kde", color="#D55E00", alpha=0.7, linestyle="--", label="Semscore")
    df["similarity_score"].plot(kind="kde", color="#0072B2", alpha=0.7, linestyle="-", label="Similarity Score")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Density")
    plt.legend()
    plt.grid(True)
    plt.show()

    plot_path = os.path.join(output_dir, filename)
    plt.savefig(plot_path)
    plt.close()
    return plot_path

# Updated process_all_elements function

def process_all_elements_updated(element_data, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    excel_file_path = os.path.join(output_dir, "combined_analysis_results.xlsx")
    writer = pd.ExcelWriter(excel_file_path, engine="xlsxwriter")
    workbook = writer.book
    combined_df_list = []
    image_files = []

    for element_name, content in element_data.items():
        df = pd.DataFrame(content)
        numeric_cols = ["semscore", "similarity_score"]
        df[numeric_cols] = df[numeric_cols].astype(float)
        df["element_type"] = element_name
        combined_df_list.append(df)
        sheet_name = element_name[:31]
        worksheet = workbook.add_worksheet(sheet_name)
        writer.sheets[sheet_name] = worksheet
        row = 0

        # Histograms side-by-side
        plot_filename = f"histogram_side_by_side_{element_name}.png"
        plot_path = plot_histogram_side_by_side(
            df,
            f"Histograms of Semscore and Similarity Score - {element_name}",
            "Scores",
            output_dir,
            plot_filename,
        )
        worksheet.insert_image(row, 0, plot_path)
        row += 20
        image_files.append(plot_path)

        # Boxplots side-by-side
        plot_filename = f"boxplot_side_by_side_{element_name}.png"
        plot_path = plot_boxplot_side_by_side(
            df,
            f"Boxplots of Semscore and Similarity Score - {element_name}",
            "Scores",
            output_dir,
            plot_filename,
        )
        worksheet.insert_image(row, 0, plot_path)
        row += 20
        image_files.append(plot_path)

        try:
            # Density plots side-by-side
            plot_filename = f"density_side_by_side_{element_name}.png"
            plot_path = plot_density_side_by_side(
                df,
                f"Density Plots of Semscore and Similarity Score - {element_name}",
                "Scores",
                output_dir,
                plot_filename,
            )
            worksheet.insert_image(row, 0, plot_path)
            row += 20
            image_files.append(plot_path)
        except Exception as e:
            logger.error(f"Error plotting density side by side: {e}")

    combined_df = pd.concat(combined_df_list, ignore_index=True)

    # Combined Histograms side-by-side
    plot_filename = "histogram_side_by_side_combined.png"
    plot_path = plot_histogram_side_by_side(
        combined_df,
        "Combined Histograms of Semscore and Similarity Score",
        "Scores",
        output_dir,
        plot_filename,
    )
    writer.sheets["Combined"] = workbook.add_worksheet("Combined")
    writer.sheets["Combined"].insert_image(0, 0, plot_path)
    image_files.append(plot_path)

    # Combined Boxplots side-by-side
    plot_filename = "boxplot_side_by_side_combined.png"
    plot_path = plot_boxplot_side_by_side(
        combined_df,
        "Combined Boxplots of Semscore and Similarity Score",
        "Scores",
        output_dir,
        plot_filename,
    )
    writer.sheets["Combined"].insert_image(25, 0, plot_path)
    image_files.append(plot_path)

    # Combined Density Plots side-by-side
    plot_filename = "density_side_by_side_combined.png"
    plot_path = plot_density_side_by_side(
        combined_df,
        "Combined Density Plots of Semscore and Similarity Score",
        "Scores",
        output_dir,
        plot_filename,
    )
    writer.sheets["Combined"].insert_image(50, 0, plot_path)
    image_files.append(plot_path)

    writer.close()
    for image_file in image_files:
        if os.path.exists(image_file):
            os.remove(image_file)
    return combined_df
InĀ [8]:
def remove_section_symbol(input_string: str) -> str:
    """
    Removes the '§' symbol from the input string and trims whitespace.

    Args:
        input_string (str): The string from which to remove the '§' symbol.

    Returns:
        str: The cleaned string without the '§' symbol and leading/trailing whitespace.

    Raises:
        TypeError: If 'input_string' is not a string.
    """
    if not isinstance(input_string, str):
        raise TypeError("input_string must be a string")
    return input_string.replace("§", "").strip()
InĀ [9]:
def prompt_analysis(raw_data, output_dir):
    # Create a DataFrame from the raw data
    data = pd.DataFrame(
        raw_data,
        columns=["filename", "doc_type", "elapsed_time", "usage", "created", "model"],
    )

    # Transform 'created' to a human-readable datetime format
    data["created"] = pd.to_datetime(data["created"], unit="s")

    # Extract relevant information from the 'usage' dictionary
    data["completion_tokens"] = data["usage"].apply(lambda x: x["completion_tokens"])
    data["prompt_tokens"] = data["usage"].apply(lambda x: x["prompt_tokens"])
    data["total_tokens"] = data["usage"].apply(lambda x: x["total_tokens"])

    # Define a function to get reference model context length
    def get_reference_model_context_length(model):
        return reference_models.get(
            model, 128_000
        )  # Default to 128,000 if model is unknown

    # Define a function to get the price per million tokens
    def get_price_per_million_tokens(model):
        return price_per_million_tokens.get(
            model, 2.50
        )  # Default to 2.50 if model is unknown

    # Add context length and price per million tokens columns
    data["reference_context_length"] = data["model"].apply(
        get_reference_model_context_length
    )
    data["price_per_million_tokens"] = data["model"].apply(get_price_per_million_tokens)

    # Overall Statistics
    total_tokens = data["total_tokens"].sum()
    num_samples = len(data)
    average_elapsed_time = data["elapsed_time"].mean()
    estimated_cost = (
        data["total_tokens"] / 1_000_000 * data["price_per_million_tokens"]
    ).sum()
    average_percentage_context_length = (
        data["total_tokens"] / data["reference_context_length"]
    ).mean() * 100
    min_created = data["created"].min().strftime("%Y-%m-%d %H:%M:%S")
    max_created = data["created"].max().strftime("%Y-%m-%d %H:%M:%S")

    # Add filename column to each statistic for origin tracking
    filename = file_info["filename"]
    # Data and time of the execution
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Create Overall Statistics DataFrame
    overall_stats_df = pd.DataFrame(
        [
            {
                "Total Tokens": total_tokens,
                "Number of Samples": num_samples,
                "Average Elapsed Time (s)": average_elapsed_time,
                "Estimated Cost (USD)": estimated_cost,
                "Average Percentage of Context Length (%)": average_percentage_context_length,
                "Min Created Timestamp": min_created,
                "Max Created Timestamp": max_created,
                "origin": filename,
                "run_at": now,
            }
        ]
    )

    # Statistics by Sample Type (doc_type)
    stats_by_doc_type = (
        data.groupby("doc_type")
        .agg(
            total_tokens=("total_tokens", "sum"),
            num_samples=("doc_type", "count"),
            average_elapsed_time=("elapsed_time", "mean"),
            average_tokens=("total_tokens", "mean"),
            estimated_cost=(
                "total_tokens",
                lambda x: (x.sum() / 1_000_000)
                * data.loc[x.index, "price_per_million_tokens"].mean(),
            ),
            average_percentage_context_length=(
                "total_tokens",
                lambda x: (
                    x.mean() / data.loc[x.index, "reference_context_length"].mean()
                )
                * 100,
            ),
        )
        .reset_index()
    )
    stats_by_doc_type["filename"] = filename
    stats_by_doc_type["run_at"] = now

    # Statistics by Model
    stats_by_model = (
        data.groupby("model")
        .agg(
            total_tokens=("total_tokens", "sum"),
            num_samples=("model", "count"),
            average_elapsed_time=("elapsed_time", "mean"),
            average_tokens=("total_tokens", "mean"),
            average_percentage_context_length=(
                "total_tokens",
                lambda x: (x.mean() / get_reference_model_context_length(x.name)) * 100,
            ),
        )
        .reset_index()
    )
    stats_by_model["filename"] = filename
    stats_by_model["run_at"] = now

    # Add estimated cost and cost columns separately since they require different calculations
    def calculate_group_cost(model):
        price = get_price_per_million_tokens(model)
        total_tokens = data[data["model"] == model]["total_tokens"].sum()
        return (total_tokens / 1_000_000) * price

    stats_by_model["estimated_cost"] = stats_by_model["model"].apply(
        calculate_group_cost
    )
    stats_by_model["cost"] = stats_by_model["estimated_cost"]

    # Calculate Tokens per Second
    # Ensure there are no division by zero issues by filtering out zero elapsed times
    data = data[data["elapsed_time"] > 0]
    data["tokens_per_second"] = data["total_tokens"] / data["elapsed_time"]

    # Write the statistics to an Excel file
    file_name = os.path.join(output_dir, "prompt-analysis.xlsx")

    with pd.ExcelWriter(file_name, engine="openpyxl") as writer:
        # Replace the data on each sheet with the new data
        overall_stats_df.to_excel(writer, sheet_name="Overall Statistics", index=False)
        stats_by_doc_type.to_excel(
            writer, sheet_name="Statistics by Sample Type", index=False
        )
        stats_by_model.to_excel(writer, sheet_name="Statistics by Model", index=False)
        additional_stats_df = pd.DataFrame(
            [
                {
                    "Average Completion Tokens": data["completion_tokens"].mean(),
                    "Average Prompt Tokens": data["prompt_tokens"].mean(),
                    "Average Total Tokens per Sample": data["total_tokens"].mean(),
                    "Total Elapsed Time (s)": data["elapsed_time"].sum(),
                    "Average Tokens per Second": data["tokens_per_second"].mean(),
                    "origin": filename,
                    "run_at": now,
                }
            ]
        )
        additional_stats_df.to_excel(
            writer, sheet_name="Additional Statistics", index=False
        )
        data.to_excel(writer, sheet_name="Raw Data", index=False)

        # Explanation Page
        explanation_data = {
            "Sheet Name": [
                "Overall Statistics",
                "Statistics by Sample Type",
                "Statistics by Model",
                "Additional Statistics",
                "Raw Data",
            ],
            "Description": [
                "Summary statistics of the entire dataset, including total tokens, number of samples, average elapsed time, and estimated cost.",
                "Statistics broken down by sample type (doc_type), including the total number of tokens and cost estimates for each type.",
                "Statistics grouped by the model used, showing token utilization, cost, and elapsed time for each model.",
                "Additional aggregated metrics such as average completion tokens, prompt tokens, total tokens per sample, and processing time.",
                "The raw data used for generating all the statistics, including individual completions and their details.",
            ],
            "Columns Explained": [
                "Total Tokens: Total number of tokens processed. Number of Samples: Total number of samples. Average Elapsed Time (s): Average time taken for processing. Estimated Cost (USD): Estimated cost for token usage. Average Percentage of Context Length (%): Average percentage of used context length. Min and Max Created Timestamp: The time range of the data collected. Origin: Source filename.",
                "doc_type: Type of document. total_tokens: Sum of tokens per document type. num_samples: Number of samples of this type. average_elapsed_time: Average time taken per document type. average_tokens: Average tokens per sample. estimated_cost: Estimated cost for tokens of this type. average_percentage_context_length: Average percentage of context length used. filename: Source filename.",
                "model: Model name. total_tokens: Total number of tokens used by the model. num_samples: Number of samples processed by the model. average_elapsed_time: Average processing time for the model. average_tokens: Average number of tokens per sample. average_percentage_context_length: Average context length percentage used. filename: Source filename. estimated_cost/cost: Cost for the tokens used by the model.",
                "Average Completion Tokens: Average number of completion tokens per sample. Average Prompt Tokens: Average number of prompt tokens per sample. Average Total Tokens per Sample: Average number of total tokens per sample. Total Elapsed Time (s): Total processing time for all samples. Average Tokens per Second: Average number of tokens processed per second. origin: Source filename.",
                "filename: Source filename. doc_type: Type of document. elapsed_time: Time taken for each document. usage: Token usage details (completion and prompt). created: Timestamp of creation. model: Model used.",
            ],
        }
        explanation_df = pd.DataFrame(explanation_data)
        explanation_df.to_excel(writer, sheet_name="Explanation", index=False)

    # Display Overall Statistics
    overall_stats_df_display = pd.DataFrame(
        [
            {
                "Total Tokens": total_tokens,
                "Number of Samples": num_samples,
                "Average Elapsed Time (s)": average_elapsed_time,
                "Estimated Cost (USD)": estimated_cost,
                "Average Percentage of Context Length (%)": average_percentage_context_length,
                "Min Created Timestamp": min_created,
                "Max Created Timestamp": max_created,
                "origin": filename,
                "run_at": now,
            }
        ]
    )
    print("\nOverall Statistics:")
    print(overall_stats_df_display.to_string(index=False))

    # Display Statistics by Sample Type
    print("\nStatistics by Sample Type (doc_type):")
    print(stats_by_doc_type.to_string(index=False))

    # Display Statistics by Model
    print("\nStatistics by Model:")
    print(stats_by_model.to_string(index=False))

    # Additional Statistics
    additional_stats_df_display = pd.DataFrame(
        [
            {
                "Average Completion Tokens": data["completion_tokens"].mean(),
                "Average Prompt Tokens": data["prompt_tokens"].mean(),
                "Average Total Tokens per Sample": data["total_tokens"].mean(),
                "Total Elapsed Time (s)": data["elapsed_time"].sum(),
                "Average Tokens per Second": data["tokens_per_second"].mean(),
                "origin": filename,
                "run_at": now,
            }
        ]
    )
    print("\nAdditional Statistics:")
    print(additional_stats_df_display.to_string(index=False))

    return data
InĀ [10]:
# Add similarity_classification based on similarity_score
def classify_similarity(score):
    if score == 1.0:
        return "identical"
    elif score >= 0.9:
        return "close-match"
    else:
        return "not-sure"


# Modify the highlight_similarity function to use three colors
def highlight_similarity(val):
    if val == "identical":
        color = "green"
    elif val == "close-match":
        color = "yellow"
    else:
        color = "red"
    return f"background-color: {color}"
InĀ [11]:
def create_df_elements_results(similarity_elements_results):
    # Build the dataframe
    df_results = pd.DataFrame(similarity_elements_results)

    df_results["similarity_classification"] = df_results["similarity_score"].apply(
        classify_similarity
    )

    df_results["classification_match"] = (
        df_results["classification_pred"] == df_results["classification_true"]
    )
    df_results["classification_match_label"] = df_results["classification_match"].map(
        {True: "match", False: "mismatch"}
    )

    df_results["source_match"] = df_results["source_pred"] == df_results["source_true"]
    df_results["source_match_label"] = df_results["source_match"].map(
        {True: "match", False: "mismatch"}
    )

    df_results["id_match"] = df_results["id_pred"] == df_results["id_true"]
    df_results["id_match_label"] = df_results["id_match"].map(
        {True: "match", False: "mismatch"}
    )

    return df_results
InĀ [12]:
class JudgeStatement(BaseModel):
    doc_id: str = Field(..., description="Document ID associated with the statement.")
    statement_id: str = Field(
        ...,
        description="A provided string that identifies the statement. e.g., '1', 'Person'.",
    )
    statement: str = Field(..., description="The statement to be transformed.")
    sources: List[str] = Field(..., description="Sources of the statement.")
    semscore: float = Field(..., description="just a copy from input semscore.")
    similarity_score: float = Field(
        ...,
        description="Similarity score between the original and transformed sentences.",
    )
    similarity_score_confidence: float = Field(
        ..., description="Confidence score for the similarity score."
    )
    transformation_accuracy: float = Field(
        ..., description="Accuracy score for the transformation."
    )
    grammar_syntax_accuracy: float = Field(
        ..., description="Accuracy score for the grammar and syntax."
    )
    findings: List[str] = Field(..., description="List of findings.")


class JudgeStatements(BaseModel):
    JudgeStatements: List[JudgeStatement] = Field(
        ..., description="List of judge statements."
    )
InĀ [13]:
def get_prompts_for_judge(rules, data_dir):
    rule_template_provider = RulesTemplateProvider(data_dir)

    system_prompts = []
    user_prompts = []

    for rule in rules:
        element_name = rule.get("element_name")

        if element_name == ["Term", "Name"]:
            statement_key = "definition"
            statement_id_key = "signifier"
        else:
            statement_key = "statement"
            statement_id_key = "statement_id"

        user_prompt = get_user_prompt_judge_sentence_similarity(element_name, rule)
        user_prompts.append(user_prompt)
        rule_templates_subtemplates = rule_template_provider.get_rules_template(
            rule["templates_ids"]
        )
        system_prompt = get_system_prompt_judge_sentence_similarity(
            rule_templates_subtemplates
        )
        system_prompts.append(system_prompt)
        logger.debug(system_prompt)
        logger.debug(user_prompt)

    logger.info(f"System prompts for {element_name}s: {len(system_prompts)}")
    logger.info(f"User prompts for {element_name}s: {len(user_prompts)}")

    return system_prompts, user_prompts, element_name
InĀ [14]:
def evaluate_statement(element_name, user_prompts, system_prompts, manager):
    # Initialize an empty list to accumulate all responses
    all_responses = []
    elapse_times = []
    completions = []

    # Loop through each pair of user and system prompts with a counter
    for index, (user_prompt, system_prompt) in enumerate(
        zip(user_prompts, system_prompts), start=1
    ):
        logger.info(f"Processing evaluation prompt {index} for {element_name}.")
        logger.debug(system_prompt)
        logger.debug(user_prompt)

        # Query the language model
        response, completion, elapse_time = query_instruct_llm(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            document_model=JudgeStatements,
            llm_model=config["LLM"]["MODEL"],
            temperature=config["LLM"]["TEMPERATURE"],
            max_tokens=config["LLM"]["MAX_TOKENS"],
        )

        logger.debug(f"{response}")

        # Accumulate the responses in the list
        all_responses.extend(response.JudgeStatements)
        elapse_times.append(elapse_time)
        completions.append(completion.dict())

        logger.info(f"Finished processing evaluation {index}.")

        logger.info("Waiting 2s before processing the next prompt to avoid rate limits")
        time.sleep(2)

    # After the loop, create a single Document with all the accumulated responses
    doc = Document(
        id=f"validation_judge_{element_name.replace(' ', '_')}s",
        type="llm_validation",
        content=all_responses,
        elapsed_times=elapse_times,
        completions=completions,
    )
    manager.add_document(doc)

    logger.info(f"{element_name}s: {len(all_responses)}")

    return all_responses
InĀ [15]:
def get_embedding(text, model="text-embedding-3-large"):
    client = OpenAI()
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding


def cosine_similarity(embedding1, embedding2):
    # Calcula a similaridade de cosseno entre dois embeddings
    embedding1 = np.array(embedding1)
    embedding2 = np.array(embedding2)
    cos_sim = np.dot(embedding1, embedding2) / (
        np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
    )
    return cos_sim


def compare_sentences(sentence1, sentence2):
    # Obtem embeddings para as duas frases
    embedding1 = get_embedding(sentence1)
    embedding2 = get_embedding(sentence2)
    # Calcula a similaridade de cosseno entre os embeddings
    # similarity = cosine_similarity(embedding1, embedding2)
    similarity = 1 - cosine(embedding1, embedding2)
    return similarity

Datasets¶

From section 7.2.4 Datasets

The dataset of the previous algorithm was adjusted with the gold standard dataset. The goal is to reduce the accumulation of errors from one step to the next.

The data adjusted:

  • § 275.0-2_P1, § 275.0-2_P2
  • § 275.0-5_P1, § 275.0-5_P2
  • § 275.0-7_P1, § 275.0-7_P2

True tables¶

There are no true tables to evaluate the transformation, the evaluation depends on the algorithms SEMSCORE and "LLM as a Judge".

Predicted values¶

Get predicted elements from all runs

InĀ [16]:
elements = []

managers, file_info_list = get_all_checkpoints(config["DEFAULT_CHECKPOINT_DIR"])

for manager, file_info in zip(managers, file_info_list):
    # Process documents
    processor = DocumentProcessor(manager, merge=True)

    # Access processed data
    elements.append(
        {"pred_facts": processor.get_rules(),
        "pred_terms": processor.get_facts(),
        "pred_names": processor.get_terms(definition_filter="non_null"),
        "pred_operative_rules": processor.get_names(definition_filter="non_null"),
        "pred_file_info": file_info}
    )
2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-1.json
2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-10.json
2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-2.json
2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-3.json
2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-4.json
2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-5.json
2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-6.json
2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-7.json
2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-8.json
2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-9.json

Set dataset to evaluation and check empty transformed elements

InĀ [17]:
for element_item in elements:
    for key in element_item.keys():
        if key == "pred_file_info":
            continue
        empty_transformed_elements = [
            item for item in element_item[key] if not item.get("transformed")
        ]
        logger.info(
            f'Empty transformed {element_item["pred_file_info"].get("filename")} {key}: {len(empty_transformed_elements)}/{len(element_item[key])}'
        )
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-1.json pred_facts: 0/6
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-1.json pred_terms: 0/16
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-1.json pred_names: 0/28
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-1.json pred_operative_rules: 0/5
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-10.json pred_facts: 0/6
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-10.json pred_terms: 0/16
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-10.json pred_names: 0/28
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-10.json pred_operative_rules: 0/5
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-2.json pred_facts: 0/6
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-2.json pred_terms: 0/16
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-2.json pred_names: 0/28
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-2.json pred_operative_rules: 0/5
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-3.json pred_facts: 0/6
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-3.json pred_terms: 0/16
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-3.json pred_names: 0/28
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-3.json pred_operative_rules: 0/5
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-4.json pred_facts: 0/6
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-4.json pred_terms: 0/16
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-4.json pred_names: 0/28
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-4.json pred_operative_rules: 0/5
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-5.json pred_facts: 0/6
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-5.json pred_terms: 0/16
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-5.json pred_names: 0/28
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-5.json pred_operative_rules: 0/5
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-6.json pred_facts: 0/6
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-6.json pred_terms: 0/16
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-6.json pred_names: 0/28
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-6.json pred_operative_rules: 0/5
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-7.json pred_facts: 0/6
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-7.json pred_terms: 0/16
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-7.json pred_names: 0/28
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-7.json pred_operative_rules: 0/5
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-8.json pred_facts: 0/6
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-8.json pred_terms: 0/16
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-8.json pred_names: 0/28
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-8.json pred_operative_rules: 0/5
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-9.json pred_facts: 0/6
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-9.json pred_terms: 0/16
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-9.json pred_names: 0/28
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-9.json pred_operative_rules: 0/5

Check missing values.

InĀ [18]:
for element_item in elements:
    for key in element_item.keys():
        if key == "pred_file_info":
            continue
        element_df = pd.DataFrame(element_item[key])

        # Check if there are any missing values
        if element_df.isnull().any().any():
            mi.matrix(element_df, figsize=(10, 5))
            plt.title(f'Missing Values for {key} in {element_item["pred_file_info"].get("filename")}')
            plt.show()  # Ensure the plot displays
        else:
            logger.info(f'No missing values for {key} in {element_item["pred_file_info"].get("filename")}')
2024-12-15 01:44:32 - INFO - No missing values for pred_facts in documents-2024-12-08-1.json
2024-12-15 01:44:32 - INFO - No missing values for pred_terms in documents-2024-12-08-1.json
2024-12-15 01:44:32 - INFO - No missing values for pred_names in documents-2024-12-08-1.json
2024-12-15 01:44:32 - INFO - No missing values for pred_operative_rules in documents-2024-12-08-1.json
2024-12-15 01:44:32 - INFO - No missing values for pred_facts in documents-2024-12-08-10.json
2024-12-15 01:44:32 - INFO - No missing values for pred_terms in documents-2024-12-08-10.json
2024-12-15 01:44:32 - INFO - No missing values for pred_names in documents-2024-12-08-10.json
2024-12-15 01:44:32 - INFO - No missing values for pred_operative_rules in documents-2024-12-08-10.json
2024-12-15 01:44:32 - INFO - No missing values for pred_facts in documents-2024-12-08-2.json
2024-12-15 01:44:32 - INFO - No missing values for pred_terms in documents-2024-12-08-2.json
2024-12-15 01:44:32 - INFO - No missing values for pred_names in documents-2024-12-08-2.json
2024-12-15 01:44:32 - INFO - No missing values for pred_operative_rules in documents-2024-12-08-2.json
2024-12-15 01:44:32 - INFO - No missing values for pred_facts in documents-2024-12-08-3.json
2024-12-15 01:44:32 - INFO - No missing values for pred_terms in documents-2024-12-08-3.json
2024-12-15 01:44:32 - INFO - No missing values for pred_names in documents-2024-12-08-3.json
2024-12-15 01:44:32 - INFO - No missing values for pred_operative_rules in documents-2024-12-08-3.json
2024-12-15 01:44:32 - INFO - No missing values for pred_facts in documents-2024-12-08-4.json
2024-12-15 01:44:32 - INFO - No missing values for pred_terms in documents-2024-12-08-4.json
2024-12-15 01:44:32 - INFO - No missing values for pred_names in documents-2024-12-08-4.json
2024-12-15 01:44:32 - INFO - No missing values for pred_operative_rules in documents-2024-12-08-4.json
2024-12-15 01:44:32 - INFO - No missing values for pred_facts in documents-2024-12-08-5.json
2024-12-15 01:44:32 - INFO - No missing values for pred_terms in documents-2024-12-08-5.json
2024-12-15 01:44:32 - INFO - No missing values for pred_names in documents-2024-12-08-5.json
2024-12-15 01:44:32 - INFO - No missing values for pred_operative_rules in documents-2024-12-08-5.json
2024-12-15 01:44:32 - INFO - No missing values for pred_facts in documents-2024-12-08-6.json
2024-12-15 01:44:32 - INFO - No missing values for pred_terms in documents-2024-12-08-6.json
2024-12-15 01:44:32 - INFO - No missing values for pred_names in documents-2024-12-08-6.json
2024-12-15 01:44:32 - INFO - No missing values for pred_operative_rules in documents-2024-12-08-6.json
2024-12-15 01:44:32 - INFO - No missing values for pred_facts in documents-2024-12-08-7.json
2024-12-15 01:44:32 - INFO - No missing values for pred_terms in documents-2024-12-08-7.json
2024-12-15 01:44:32 - INFO - No missing values for pred_names in documents-2024-12-08-7.json
2024-12-15 01:44:32 - INFO - No missing values for pred_operative_rules in documents-2024-12-08-7.json
2024-12-15 01:44:32 - INFO - No missing values for pred_facts in documents-2024-12-08-8.json
2024-12-15 01:44:32 - INFO - No missing values for pred_terms in documents-2024-12-08-8.json
2024-12-15 01:44:32 - INFO - No missing values for pred_names in documents-2024-12-08-8.json
2024-12-15 01:44:32 - INFO - No missing values for pred_operative_rules in documents-2024-12-08-8.json
2024-12-15 01:44:32 - INFO - No missing values for pred_facts in documents-2024-12-08-9.json
2024-12-15 01:44:32 - INFO - No missing values for pred_terms in documents-2024-12-08-9.json
2024-12-15 01:44:32 - INFO - No missing values for pred_names in documents-2024-12-08-9.json
2024-12-15 01:44:32 - INFO - No missing values for pred_operative_rules in documents-2024-12-08-9.json

Algorithms¶

Validation of algorithm from section 6.2 Implementation of main components

Source for section 7.3 Results

nlp2sbvr¶

Elements measurements from chapter 7.2.3 Terms, names, facts, and operative rules

Measuring similarity with SEMSCORE¶

Evaluating SEMSCORE (AYNETDINOV;AKBIK, 2024) between the predicted and true statements for each element.

WARNING: Expensive operation!

If the data is available could skip processing evaluation. Operation is expensive, if just need to compile the evaluation, set SKIP to True.

InĀ [19]:
SKIP = True
InĀ [20]:
if not SKIP:
    for element_item in elements:
        for key in element_item.keys():
            if key == "pred_file_info":
                continue
            for item in element_item[key]:
                original_sentence = f'{item.get("statement_id")}: {item.get("statement", item.get("definition"))}'
                transformed_sentence = item.get("transformed")
                templates_ids = item.get("templates_ids")
                element_name = item.get("element_name")

                logger.info(f"{original_sentence=}")
                logger.info(f"{transformed_sentence=}")
                logger.info(f"{templates_ids=}")
                logger.info(f"{element_name=}")
                logger.info(f"{key=}")
                logger.info(f'{element_item["pred_file_info"]=}')

                # Remove keys if they exist
                for key in [
                    "explanation",
                    "confidence",
                    "subtype_confidence",
                    "subtype_explanation",
                ]:
                    item.pop(key, None)  # Using pop with None to avoid KeyError

                logger.debug(f"{element_name=}")

                # Calculate similarity score
                similarity = compare_sentences(original_sentence, transformed_sentence)

                # Assign the calculated score to 'semscore'
                item["semscore"] = similarity
            else:
                logger.debug(
                    f"{item.get('element_name')} already has a semscore: {item['semscore']}"
                )

Check if SEMSCORE was calculated.

InĀ [21]:
for element_item in elements:
    for key in element_item.keys():
        if key == "pred_file_info":
            continue
        semscore_in_operative_rules = all(
            "semscore" in item and item["semscore"] is not None
            for item in element_item[key]
        )

        logger.info(
            f'{key} to evaluate: {len(element_item[key])}, semscore was calculated: {semscore_in_operative_rules}'
        )
2024-12-15 01:44:32 - INFO - pred_facts to evaluate: 6, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_terms to evaluate: 16, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_names to evaluate: 28, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_operative_rules to evaluate: 5, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_facts to evaluate: 6, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_terms to evaluate: 16, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_names to evaluate: 28, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_operative_rules to evaluate: 5, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_facts to evaluate: 6, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_terms to evaluate: 16, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_names to evaluate: 28, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_operative_rules to evaluate: 5, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_facts to evaluate: 6, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_terms to evaluate: 16, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_names to evaluate: 28, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_operative_rules to evaluate: 5, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_facts to evaluate: 6, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_terms to evaluate: 16, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_names to evaluate: 28, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_operative_rules to evaluate: 5, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_facts to evaluate: 6, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_terms to evaluate: 16, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_names to evaluate: 28, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_operative_rules to evaluate: 5, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_facts to evaluate: 6, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_terms to evaluate: 16, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_names to evaluate: 28, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_operative_rules to evaluate: 5, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_facts to evaluate: 6, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_terms to evaluate: 16, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_names to evaluate: 28, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_operative_rules to evaluate: 5, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_facts to evaluate: 6, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_terms to evaluate: 16, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_names to evaluate: 28, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_operative_rules to evaluate: 5, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_facts to evaluate: 6, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_terms to evaluate: 16, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_names to evaluate: 28, semscore was calculated: True
2024-12-15 01:44:32 - INFO - pred_operative_rules to evaluate: 5, semscore was calculated: True

Evaluation criterias (SHANKAR et al., 2024)¶

Based on the prompt, there are three inferred evaluation criteria that align with the approach proposed by EvalGen (SHANKAR et al., 2024):

  1. Similarity Score

    • Given the original_sentence and tranformed_sentence, how similar are they from 0 to 1? And how confident are you about your estimation from 0 to 1?
  2. Transformation Accuracy

    • From 0 to 1, how does the "transformed_sentence" reflect the original_sentence with the structure and phrasing provided by the template?
  3. Grammar and Syntax Accuracy

    • How is the transformed sentence grammatically correct and syntactically accurate from 0 to 1?

LLM-as-a-judge¶

References of the LLM-as-a-judge approach: (WEI; CHEN; LUO, 2024), (DONG; HU; COLLIER, 2024), (ZHENG et al., 2023)

Prompt engineering¶

System prompt

InĀ [22]:
def get_system_prompt_judge_sentence_similarity(template):
    return f"""
   # Task

   You're an expert in judging sentence similarity and transformation using a template. 

   These criteria should support the evaluation process by verifying classification accuracy, template application, and transformation fidelity.

   Check the criteria and evaluate the output:

   1. **Similarity Score**
      - Given the statement or definition and tranformed sentence (transformed), how similar are they from 0 to 1? And how confident are you about your estimation from 0 to 1?

   2. **Transformation Accuracy**
      - From 0 to 1, how does the transformed sentence (transformed) reflect the original sentence (statement or definition) with the structure and phrasing provided by the template and subtemplates?

   3. **Grammar and Syntax Accuracy**
      - How is the transformed sentence (transformed) grammatically correct and syntactically accurate from 0 to 1?

   # Output Format

   Record your evaluation in JSON format as follows:

   ```json
   {{
      "doc_id": "<Document ID>",
      "statement_id": "<Statement ID>",
      "sources": ["<source>"],
      "similarity_score": <Similarity score>,
      "similarity_score_confidence": <Confidence score>,
      "transformation_accuracy": <Transformation score>,
      "grammar_syntax_accuracy": <Grammar score>,
      "findings": ["<Things found during the evaluation and worth to be mentioned>", 
                  "<other things to mention>"
                  ],
      "semscore": <original semscore>
   }}
   ```

   # Input example

   {{
      "doc_id": <Document ID>,
      "statement_id": <Statement ID>,
      "statement or definition": <original sentence>,
      "sources": [<source>],
      "terms": [
         {{"term": <signifier>, "classification": <Proper or Common Noun>}},
         ...
      ],
      "verb_symbols": <verbs or phrasal verbs>,
      "element_name": <name of element: Name, Term, Fact, Fact Type, Operative Rule>,
      "transformed": <transformed sentence>,
      "type": <type of element: Definitional, Activity, Party, Data>,
      "subtype": <subtype of element>,
      "templates_ids": ["T8"],
      "semscore": <semscore>
   }}

   # Templates and Subtemplates

   {template}
   """

User prompt

InĀ [23]:
def get_user_prompt_judge_sentence_similarity(element_name, rule):
    return f"""
# rule data for an element: {element_name}

{json.dumps(rule, indent=2)}
    """
Measuring similarity with LLM Judge¶

Preparing system and user prompts for each element and call the judge.

InĀ [24]:
if not SKIP:
    for element_item in elements:
        for key in element_item.keys():
            if key == "pred_file_info":
                continue
            system_prompts, user_prompts, element_name = get_prompts_for_judge(
                element_item[key], config["DEFAULT_DATA_DIR"]
            )

            logger.debug(f"{system_prompts=}")
            logger.debug(f"{user_prompts=}")

            responses = evaluate_statement(
                element_name=element_name,
                user_prompts=user_prompts,
                system_prompts=system_prompts,
                manager=manager,
            )

            # Persist the state to a file
            filename=f'{config["DEFAULT_CHECKPOINT_DIR"]}/{element_item["pred_file_info"].get("filename")}'
            logger.debug(f"Saving the state to a file for {filename}")
            save_checkpoint(filename=filename, manager=manager)

Average similarity score per document 5s.

Elements evaluation¶

InĀ [25]:
managers, file_info_list = get_all_checkpoints(config["DEFAULT_CHECKPOINT_DIR"])
2024-12-15 01:44:32 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-1.json
2024-12-15 01:44:32 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-10.json
2024-12-15 01:44:32 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-2.json
2024-12-15 01:44:32 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-3.json
2024-12-15 01:44:32 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-4.json
2024-12-15 01:44:32 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-5.json
2024-12-15 01:44:32 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-6.json
2024-12-15 01:44:32 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-7.json
2024-12-15 01:44:32 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-8.json
2024-12-15 01:44:32 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-9.json
InĀ [26]:
eval_operative_rules = []
eval_facts = []
eval_terms = []
eval_names = []

for manager, file_info in zip(managers, file_info_list):
    # Process documents
    eval_operative_rules.extend(manager.retrieve_document(
        "validation_judge_Operative_Rules", "llm_validation"
    ).content)
    eval_names.extend(manager.retrieve_document(
        "validation_judge_Names", "llm_validation"
    ).content)
    eval_terms.extend(manager.retrieve_document(
        "validation_judge_Terms", "llm_validation"
    ).content)
    eval_facts.extend(manager.retrieve_document(
        "validation_judge_Fact_Types", "llm_validation"
    ).content)

logger.info(f"Operative Rules: {len(eval_operative_rules)}")
logger.info(f"Names: {len(eval_names)}")
logger.info(f"Terms: {len(eval_terms)}")
logger.info(f"Facts: {len(eval_facts)}")
2024-12-15 01:44:32 - INFO - Operative Rules: 60
2024-12-15 01:44:32 - INFO - Names: 50
2024-12-15 01:44:32 - INFO - Terms: 280
2024-12-15 01:44:32 - INFO - Facts: 160
InĀ [27]:
elements_data = {
    "Operative_Rules": eval_operative_rules,
    "Names": eval_names,
    "Terms": eval_terms,
    "Fact_Types": eval_facts,
}
InĀ [28]:
for key in elements_data.keys():
    logger.info(f"{key}: {len(elements_data[key])}")
2024-12-15 01:44:32 - INFO - Operative_Rules: 60
2024-12-15 01:44:32 - INFO - Names: 50
2024-12-15 01:44:32 - INFO - Terms: 280
2024-12-15 01:44:32 - INFO - Fact_Types: 160

Checking missing data

InĀ [29]:
for element_key in elements_data.keys():
    element_df = pd.DataFrame(elements_data[element_key])
    mi.matrix(element_df, figsize=(10, 5))
    plt.title(f"Missing Values for {element_key}")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Metrics¶

InĀ [30]:
combined_df = process_all_elements_updated(elements_data, config["DEFAULT_OUTPUT_DIR"])
No description has been provided for this image
/tmp/ipykernel_127502/1859200400.py:21: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
  boxplot = plt.boxplot(
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
/tmp/ipykernel_127502/1859200400.py:21: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
  boxplot = plt.boxplot(
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
/tmp/ipykernel_127502/1859200400.py:21: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
  boxplot = plt.boxplot(
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
/tmp/ipykernel_127502/1859200400.py:21: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
  boxplot = plt.boxplot(
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
/tmp/ipykernel_127502/1859200400.py:21: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
  boxplot = plt.boxplot(
No description has been provided for this image
No description has been provided for this image

Describing the metrics semscore and similarity_score

InĀ [31]:
combined_df.groupby("element_type")[["semscore", "similarity_score"]].describe()#.to_excel(config["DEFAULT_OUTPUT_DIR"] + "/sem_sim_descriptive_stats.xlsx")
Out[31]:
semscore similarity_score
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
element_type
Fact_Types 160.0 0.875031 0.054416 0.714021 0.821412 0.883450 0.918956 0.972246 160.0 0.920000 0.056413 0.70 0.90 0.95 0.95 0.95
Names 50.0 0.885333 0.030958 0.810035 0.883378 0.902060 0.904557 0.909534 50.0 0.949000 0.007071 0.90 0.95 0.95 0.95 0.95
Operative_Rules 60.0 0.907990 0.019192 0.873461 0.889514 0.910879 0.922699 0.933172 60.0 0.903333 0.047716 0.75 0.90 0.90 0.95 0.95
Terms 280.0 0.848829 0.077379 0.504298 0.829878 0.851205 0.904896 0.960378 280.0 0.920536 0.051306 0.60 0.90 0.95 0.95 1.00

See correlation analysis below

Similarity_score and confidence

InĀ [32]:
combined_df.groupby("element_type")[["similarity_score", "similarity_score_confidence"]].describe()
Out[32]:
similarity_score similarity_score_confidence
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
element_type
Fact_Types 160.0 0.920000 0.056413 0.70 0.90 0.95 0.95 0.95 160.0 0.891563 0.026435 0.7 0.90 0.9 0.9 0.90
Names 50.0 0.949000 0.007071 0.90 0.95 0.95 0.95 0.95 50.0 0.901000 0.007071 0.9 0.90 0.9 0.9 0.95
Operative_Rules 60.0 0.903333 0.047716 0.75 0.90 0.90 0.95 0.95 60.0 0.879167 0.029533 0.8 0.85 0.9 0.9 0.90
Terms 280.0 0.920536 0.051306 0.60 0.90 0.95 0.95 1.00 280.0 0.889107 0.033024 0.8 0.85 0.9 0.9 1.00
InĀ [33]:
# Calculate correlation by element_type
combined_df.groupby("element_type").apply(
    lambda group: group["similarity_score"].corr(group["similarity_score_confidence"])
).reset_index(name="correlation")
/tmp/ipykernel_127502/3731941340.py:2: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  combined_df.groupby("element_type").apply(
Out[33]:
element_type correlation
0 Fact_Types 0.598877
1 Names -1.000000
2 Operative_Rules 0.591359
3 Terms 0.555577

transformation_accuracy and grammar_syntax_accuracy

InĀ [34]:
combined_df.groupby("element_type")[["transformation_accuracy", "grammar_syntax_accuracy"]].describe()
Out[34]:
transformation_accuracy grammar_syntax_accuracy
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
element_type
Fact_Types 160.0 0.866875 0.089000 0.50 0.85 0.90 0.90 0.95 160.0 0.931250 0.088622 0.50 0.95 0.95 0.95 1.00
Names 50.0 0.919000 0.026515 0.85 0.90 0.90 0.95 0.95 50.0 0.970000 0.024744 0.95 0.95 0.95 1.00 1.00
Operative_Rules 60.0 0.840000 0.073531 0.70 0.80 0.85 0.90 0.90 60.0 0.927500 0.054792 0.70 0.95 0.95 0.95 0.95
Terms 280.0 0.879464 0.080086 0.50 0.80 0.90 0.95 1.00 280.0 0.954821 0.064090 0.40 0.95 0.95 1.00 1.00
InĀ [35]:
# Calculate correlation by element_type
combined_df.groupby("element_type").apply(
    lambda group: group["transformation_accuracy"].corr(group["grammar_syntax_accuracy"])
).reset_index(name="correlation")
/tmp/ipykernel_127502/1485779397.py:2: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  combined_df.groupby("element_type").apply(
Out[35]:
element_type correlation
0 Fact_Types 0.680278
1 Names 0.808757
2 Operative_Rules 0.689936
3 Terms 0.736881

Correlation analysis similarity_score and semscore¶

Top 10 lowest semscore

InĀ [36]:
# Make a copy of the DataFrame for further analysis
df_aval = combined_df.copy()

df_similarity = combined_df.copy()

df_agree = combined_df.copy()

Top 10 lowest similarity_score

InĀ [37]:
df_smallest = df_aval.nsmallest(15, ["semscore"])
InĀ [38]:
df_smallest
Out[38]:
doc_id statement_id statement sources semscore similarity_score similarity_score_confidence transformation_accuracy grammar_syntax_accuracy findings element_type
295 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.504298 0.85 0.90 0.8 0.95 [The transformed sentence accurately reflects ... Terms
211 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.534698 0.85 0.90 0.8 0.95 [The transformed sentence accurately reflects ... Terms
155 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.588489 0.90 0.85 0.9 0.95 [The transformed sentence maintains the meanin... Terms
183 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.588578 0.90 0.85 0.9 0.95 [The transformed sentence accurately reflects ... Terms
323 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.588613 0.90 0.85 0.9 0.95 [The transformed sentence accurately reflects ... Terms
127 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.588633 0.90 0.85 0.9 0.95 [The transformed sentence accurately reflects ... Terms
351 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.588633 0.90 0.85 0.9 0.95 [The transformed sentence accurately reflects ... Terms
239 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.588672 0.90 0.85 0.9 0.95 [The transformed sentence accurately reflects ... Terms
379 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.588672 0.90 0.85 0.9 0.95 [The transformed sentence accurately reflects ... Terms
267 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.589421 0.90 0.85 0.9 0.95 [The transformed sentence accurately reflects ... Terms
329 § 275.0-7 Control The power, directly or indirectly, to direct t... [(b)(1), (a)(3)] 0.685248 0.90 0.85 0.8 0.95 [The transformed sentence maintains the core m... Terms
217 § 275.0-7 Control The power, directly or indirectly, to direct t... [(b)(1), (a)(3)] 0.711167 0.85 0.90 0.8 0.95 [The transformed sentence maintains the core m... Terms
245 § 275.0-7 Control The power, directly or indirectly, to direct t... [(b)(1), (a)(3)] 0.712439 0.85 0.90 0.8 0.95 [The transformed sentence maintains the core m... Terms
189 § 275.0-7 Control The power, directly or indirectly, to direct t... [(a)(3), (b)(1)] 0.712491 0.85 0.90 0.8 0.95 [The transformed sentence maintains the core m... Terms
385 § 275.0-7 Control The power, directly or indirectly, to direct t... [(b)(1), (a)(3)] 0.713834 0.90 0.85 0.8 0.95 [The transformed sentence maintains the core m... Terms
InĀ [39]:
# Convert the 'sources' column to a string type to allow dropping duplicates
df_aval["sources"] = df_aval["sources"].apply(str)

# Filter the distinct records based on doc_id, statement_id, statement, and sources
df_aval.drop_duplicates(subset=["doc_id", "statement_id", "statement", "sources"]).nsmallest(15, ["semscore"])
Out[39]:
doc_id statement_id statement sources semscore similarity_score similarity_score_confidence transformation_accuracy grammar_syntax_accuracy findings element_type
127 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... ['(c)'] 0.588633 0.90 0.85 0.90 0.95 [The transformed sentence accurately reflects ... Terms
115 § 275.0-5 Notice A publication in the Federal Register indicati... ['(a)'] 0.764826 0.95 0.90 0.95 1.00 [The transformed sentence accurately reflects ... Terms
130 § 275.0-7 Small business An investment adviser with assets under manage... ['(a)'] 0.766613 0.85 0.90 0.80 0.95 [The transformed sentence captures the main id... Terms
158 \n§ 275.0-7 Small business An investment adviser with assets under manage... ['(a)'] 0.766658 0.85 0.90 0.80 0.95 [The transformed sentence accurately reflects ... Terms
114 § 275.0-5 Order disposing of the matter An order issued after the period of time for s... ['(b)', '(a)'] 0.774899 0.90 0.85 0.80 0.95 [The transformed sentence maintains the core m... Terms
142 § 275.0-5 Order disposing of the matter An order issued after the period of time for s... ['(a)', '(b)'] 0.775006 0.90 0.85 0.80 0.95 [The transformed sentence maintains the core m... Terms
414 § 275.0-7 3 An investment adviser did not have total asset... ['(a)(2)'] 0.792294 0.95 0.90 0.90 1.00 [The transformed sentence accurately reflects ... Fact_Types
398 \n§ 275.0-7 3 An investment adviser did not have total asset... ['(a)(2)'] 0.792303 0.95 0.90 0.90 1.00 [The transformed sentence accurately reflects ... Fact_Types
400 § 275.0-7 5 Control means the power, directly or indirectl... ['(b)(1)'] 0.795561 0.90 0.85 0.90 0.95 [The transformed sentence maintains the core m... Fact_Types
131 § 275.0-7 Small organization An investment adviser with assets under manage... ['(a)'] 0.810215 0.85 0.90 0.80 0.95 [The transformed sentence captures the main id... Terms
187 \n§ 275.0-7 Small organization An investment adviser with assets under manage... ['(a)'] 0.810718 0.85 0.90 0.80 0.95 [The transformed sentence captures the main id... Terms
395 § 275.0-5 5 For purposes of this rule, an application mean... ['(d)'] 0.812092 0.95 0.90 0.90 0.95 [The transformed sentence accurately reflects ... Fact_Types
128 § 275.0-5 Order of the Commission An order issued by the Commission under the Act. ['(d)'] 0.814453 0.60 0.80 0.50 0.40 [The transformed sentence does not accurately ... Terms
404 § 275.0-7 9 A person is presumed to control a trust if the... ['(b)(1)(iv)'] 0.820486 0.80 0.90 0.70 0.60 [The transformed sentence maintains the core i... Fact_Types
123 § 275.0-5 Reasons The justification provided by an interested pe... ['(a)'] 0.830164 0.95 0.90 0.90 0.95 [The transformed sentence accurately reflects ... Terms
InĀ [40]:
df_similarity['score_difference'] = df_similarity['similarity_score'] - df_similarity['semscore']
InĀ [41]:
df_similarity
Out[41]:
doc_id statement_id statement sources semscore similarity_score similarity_score_confidence transformation_accuracy grammar_syntax_accuracy findings element_type score_difference
0 § 275.0-2 3 The Secretary of the Commission (Secretary) wi... [(a)(2)] 0.907310 0.90 0.90 0.85 0.95 [The transformed sentence maintains the core m... Operative_Rules -0.007310
1 § 275.0-2 4 If the Secretary certifies that the Commission... [(a)(3)] 0.922476 0.90 0.85 0.90 0.95 [The transformed sentence maintains the origin... Operative_Rules -0.022476
2 § 275.0-5 1 Notice of the initiation of the proceeding wil... [(a)] 0.889516 0.95 0.90 0.90 0.95 [The transformed sentence maintains the origin... Operative_Rules 0.060484
3 § 275.0-5 2 Any interested person may, within the period o... [(a)] 0.881079 0.85 0.90 0.80 0.95 [The transformed sentence captures the essence... Operative_Rules -0.031079
4 § 275.0-5 3 An order disposing of the matter will be issue... [(b)] 0.933155 0.90 0.85 0.70 0.80 [The transformed sentence uses 'may be issued'... Operative_Rules -0.033155
... ... ... ... ... ... ... ... ... ... ... ... ...
545 § 275.0-7 6 A person is presumed to control a corporation ... [(b)(1)(i)(A)] 0.732264 0.85 0.90 0.80 0.95 [The transformed sentence maintains the core m... Fact_Types 0.117736
546 § 275.0-7 7 A person is presumed to control a partnership ... [(b)(1)(ii)] 0.889638 0.95 0.90 0.90 0.95 [The transformed sentence maintains the origin... Fact_Types 0.060362
547 § 275.0-7 8 A person is presumed to control a limited liab... [(b)(1)(iii)] 0.940801 0.95 0.90 0.90 0.95 [The transformed sentence maintains the origin... Fact_Types 0.009199
548 § 275.0-7 9 A person is presumed to control a trust if the... [(b)(1)(iv)] 0.821413 0.80 0.90 0.70 0.60 [The transformed sentence introduces 'by defin... Fact_Types -0.021413
549 § 275.0-7 10 Total assets means the total assets as shown o... [(b)(2)] 0.920697 0.95 0.90 0.90 0.95 [The transformed sentence closely follows the ... Fact_Types 0.029303

550 rows Ɨ 12 columns

InĀ [42]:
# Plot the semscore, similarity score, and score difference on the same graph
plt.figure(figsize=(12, 6))

plt.plot(df_similarity.index, df_similarity['semscore'], color='#D55E00', marker='x', linestyle='--', label='Semscore')
plt.plot(df_similarity.index, df_similarity['similarity_score'], color='#0072B2', marker='o', linestyle='-', label='Similarity Score')

plt.title('Semscore, and Similarity Score Across Records')
plt.xlabel('Record Index')
plt.ylabel('Scores')
plt.grid(True)
plt.legend()
plt.show()
No description has been provided for this image
InĀ [43]:
# Plot the score difference as a line chart
plt.figure(figsize=(10, 6))
plt.plot(df_similarity.index, df_similarity['score_difference'], marker='o', linestyle='-', label='Score Difference')
plt.title('Score Difference Across Records')
plt.xlabel('Record Index')
plt.ylabel('Score Difference')
plt.grid(True)
plt.legend()
plt.show()
No description has been provided for this image
InĀ [44]:
# Create an interactive scatter plot
fig = go.Figure()

marker_map = {
    'Operative_Rules': 'circle',
    'Names': 'x',
    'Terms': 'triangle-up',
    'Fact_Types': 'diamond'
}

# Add a trace for each element_type
unique_types = df_similarity['element_type'].unique()
for etype in unique_types:
    filtered_data = df_similarity[df_similarity['element_type'] == etype]
    fig.add_trace(go.Scatter(
        x=filtered_data.index,
        y=filtered_data['score_difference'],
        mode='lines+markers',
        marker=dict(symbol=marker_map[etype]),  # Wrap the symbol in a dictionary
        name=etype,
        visible=True  # Ensure all traces are visible initially
    ))

# Add dropdown to filter by element_type
dropdown_buttons = [
    dict(label="All",
         method="update",
         args=[{"visible": [True] * len(unique_types)},  # Show all traces
               {"title": "Score Difference - All Element Types"}]),
]

for i, etype in enumerate(unique_types):
    dropdown_buttons.append(
        dict(label=etype,
             method="update",
             args=[{"visible": [j == i for j in range(len(unique_types))]},  # Show only the selected trace
                   {"title": f"Score Difference - {etype}"}])
    )

fig.update_layout(
    updatemenus=[
        dict(
            buttons=dropdown_buttons,
            direction="down",
            showactive=True,
            x=0.1,
            y=1.15
        )
    ],
    title="Score Difference Across Element Types",
    xaxis_title="Record Index",
    yaxis_title="Score Difference",
    showlegend=True
)

fig.show()
InĀ [45]:
df_agree['score_difference'] = df_agree['similarity_score'] - df_agree['semscore']

# Calculate the required values
agree = ((df_agree['score_difference'] >= -0.01) & (df_agree['score_difference'] <= 0.01)).sum()
similarity = (1 - df_agree.loc[df_agree['score_difference'] > 0.01, 'score_difference']).sum()
semscore = (1 - df_agree.loc[df_agree['score_difference'] < 0.01, 'score_difference']).sum()

# Create a new DataFrame with the calculated values
summary_df = pd.DataFrame({
    'Metric': ['Agree', 'Similarity', 'Semscore'],
    'Value': [agree, similarity, semscore]
})

# Plot the histogram
plt.figure(figsize=(8, 6))
plt.bar(summary_df['Metric'], summary_df['Value'], color=['blue', 'green', 'red'])
plt.title('Histogram of Metrics')
plt.xlabel('Metrics')
plt.ylabel('Values')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
No description has been provided for this image
InĀ [46]:
# Display the calculated values
summary_df
Out[46]:
Metric Value
0 Agree 34.000000
1 Similarity 412.242183
2 Semscore 107.580991
InĀ [47]:
# Count the occurrences for each metric
agree_count = ((df_agree['score_difference'] >= -0.01) & (df_agree['score_difference'] <= 0.01)).sum()
similarity_count = (df_agree['score_difference'] > 0.01).sum()
semscore_count = (df_agree['score_difference'] < 0.01).sum()

# Create a new DataFrame with the counts
count_summary_df = pd.DataFrame({
    'Metric': ['Agree', 'Similarity', 'Semscore'],
    'Count': [agree_count, similarity_count, semscore_count]
})

# Plot the histogram for counts
plt.figure(figsize=(8, 6))
plt.bar(count_summary_df['Metric'], count_summary_df['Count'], color=['blue', 'green', 'red'])
plt.title('Histogram of Metric Counts')
plt.xlabel('Metrics')
plt.ylabel('Counts')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
No description has been provided for this image
InĀ [48]:
# Display the calculated counts
count_summary_df
Out[48]:
Metric Count
0 Agree 34
1 Similarity 447
2 Semscore 103
InĀ [49]:
# Define the 10% margin
margin = 0.1

# Compute agreement within the +/-10% margin
agreement_margin = ((df_agree['similarity_score'] >= (df_agree['semscore'] - margin)) &
                    (df_agree['similarity_score'] <= (df_agree['semscore'] + margin))).sum()

# Compute disagreement outside the +/-10% margin
disagreement_margin = len(df_agree) - agreement_margin

# Display the results
agreement_disagreement_summary = pd.DataFrame({
    'Metric': ['Agreement', 'Disagreement'],
    'Count': [agreement_margin, disagreement_margin]
})
InĀ [50]:
agreement_disagreement_summary
Out[50]:
Metric Count
0 Agreement 412
1 Disagreement 138
InĀ [51]:
# Compute proportional agreement within the ±10% margin
df_agree['agreement_proportion'] = 1 - (df_agree['similarity_score'] - df_agree['semscore']).abs() / margin
#df_agree['agreement_proportion'] = df_agree['agreement_proportion'].clip(lower=0)  # Clip negative values to 0

# Plot the proportional agreement series
plt.figure(figsize=(12, 6))
plt.plot(df_agree.index, df_agree['agreement_proportion'], marker='o', linestyle='-', label='Proportional Agreement')
plt.title('Proportional Agreement Series')
plt.xlabel('Record Index')
plt.ylabel('Agreement Proportion')
plt.grid(True)
plt.legend()
plt.show()
No description has been provided for this image

Agree that something is bad

InĀ [52]:
# Define the threshold for "low" scores
low_threshold = 0.68

# Identify rows where both metrics are below the threshold
low_agreement_df = df_agree[
    #(df_agree['similarity_score'] < low_threshold)
    (df_agree['semscore'] < low_threshold)
]

# Display the filtered dataframe
low_agreement_df
Out[52]:
doc_id statement_id statement sources semscore similarity_score similarity_score_confidence transformation_accuracy grammar_syntax_accuracy findings element_type score_difference agreement_proportion
127 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.588633 0.90 0.85 0.9 0.95 [The transformed sentence accurately reflects ... Terms 0.311367 -2.113668
155 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.588489 0.90 0.85 0.9 0.95 [The transformed sentence maintains the meanin... Terms 0.311511 -2.115114
183 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.588578 0.90 0.85 0.9 0.95 [The transformed sentence accurately reflects ... Terms 0.311422 -2.114224
211 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.534698 0.85 0.90 0.8 0.95 [The transformed sentence accurately reflects ... Terms 0.315302 -2.153024
239 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.588672 0.90 0.85 0.9 0.95 [The transformed sentence accurately reflects ... Terms 0.311328 -2.113276
267 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.589421 0.90 0.85 0.9 0.95 [The transformed sentence accurately reflects ... Terms 0.310579 -2.105792
295 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.504298 0.85 0.90 0.8 0.95 [The transformed sentence accurately reflects ... Terms 0.345702 -2.457016
323 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.588613 0.90 0.85 0.9 0.95 [The transformed sentence accurately reflects ... Terms 0.311387 -2.113867
351 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.588633 0.90 0.85 0.9 0.95 [The transformed sentence accurately reflects ... Terms 0.311367 -2.113668
379 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.588672 0.90 0.85 0.9 0.95 [The transformed sentence accurately reflects ... Terms 0.311328 -2.113276
InĀ [53]:
# Add a temporary column for the product
df_agree['product'] = df_agree['similarity_score'] * df_agree['semscore']

# Sort the dataframe by the product column
sorted_df = df_agree.sort_values(by='product', ascending=True)

# Drop the temporary column if it's no longer needed
df_agree = df_agree.drop(columns=['product'])

# Display the sorted dataframe
sorted_df
Out[53]:
doc_id statement_id statement sources semscore similarity_score similarity_score_confidence transformation_accuracy grammar_syntax_accuracy findings element_type score_difference agreement_proportion product
295 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.504298 0.85 0.90 0.8 0.95 [The transformed sentence accurately reflects ... Terms 0.345702 -2.457016 0.428654
211 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.534698 0.85 0.90 0.8 0.95 [The transformed sentence accurately reflects ... Terms 0.315302 -2.153024 0.454493
296 § 275.0-5 Order of the Commission An order issued by the Commission under the Act. [(d)] 0.814386 0.60 0.80 0.5 0.40 [The transformed sentence does not accurately ... Terms -0.214386 -1.143860 0.488632
128 § 275.0-5 Order of the Commission An order issued by the Commission under the Act. [(d)] 0.814453 0.60 0.80 0.5 0.40 [The transformed sentence does not accurately ... Terms -0.214453 -1.144528 0.488672
155 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.588489 0.90 0.85 0.9 0.95 [The transformed sentence maintains the meanin... Terms 0.311511 -2.115114 0.529640
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
197 § 275.0-2 Non-resident An individual, corporation, partnership, or ot... [(b)(2)] 0.960137 1.00 1.00 1.0 1.00 [The transformed sentence accurately reflects ... Terms 0.039863 0.601373 0.960137
169 § 275.0-2 Non-resident An individual, corporation, partnership, or ot... [(b)(2)] 0.960137 1.00 1.00 1.0 1.00 [The transformed sentence accurately reflects ... Terms 0.039863 0.601373 0.960137
337 § 275.0-2 Non-resident An individual, corporation, partnership, or ot... [(b)(2)] 0.960137 1.00 1.00 1.0 1.00 [The transformed sentence accurately reflects ... Terms 0.039863 0.601373 0.960137
113 § 275.0-2 Non-resident An individual, corporation, partnership, or ot... [(b)(2)] 0.960142 1.00 1.00 1.0 1.00 [The transformed sentence accurately reflects ... Terms 0.039858 0.601421 0.960142
225 § 275.0-2 Non-resident An individual, corporation, partnership, or ot... [(b)(2)] 0.960378 1.00 1.00 1.0 1.00 [The transformed sentence accurately reflects ... Terms 0.039622 0.603782 0.960378

550 rows Ɨ 14 columns

InĀ [54]:
# Display the dataframe with the proportional agreement column
df_agree.sort_values('agreement_proportion', ascending=True)
Out[54]:
doc_id statement_id statement sources semscore similarity_score similarity_score_confidence transformation_accuracy grammar_syntax_accuracy findings element_type score_difference agreement_proportion
295 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.504298 0.85 0.90 0.80 0.95 [The transformed sentence accurately reflects ... Terms 0.345702 -2.457016
211 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.534698 0.85 0.90 0.80 0.95 [The transformed sentence accurately reflects ... Terms 0.315302 -2.153024
155 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.588489 0.90 0.85 0.90 0.95 [The transformed sentence maintains the meanin... Terms 0.311511 -2.115114
183 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.588578 0.90 0.85 0.90 0.95 [The transformed sentence accurately reflects ... Terms 0.311422 -2.114224
323 § 275.0-5 Protection of investors A consideration for ordering a hearing if it a... [(c)] 0.588613 0.90 0.85 0.90 0.95 [The transformed sentence accurately reflects ... Terms 0.311387 -2.113867
... ... ... ... ... ... ... ... ... ... ... ... ... ...
0 § 275.0-2 3 The Secretary of the Commission (Secretary) wi... [(a)(2)] 0.907310 0.90 0.90 0.85 0.95 [The transformed sentence maintains the core m... Operative_Rules -0.007310 0.926898
344 § 275.0-5 Period of time The timeframe specified in the notice during w... [(a), (b)] 0.905456 0.90 0.95 0.85 0.95 [The transformed sentence maintains the core m... Terms -0.005456 0.945445
116 § 275.0-5 Initiation of the proceeding The process that begins when a notice is publi... [(a)] 0.944867 0.95 0.90 0.95 1.00 [The transformed sentence accurately reflects ... Terms 0.005133 0.948668
97 § 275.0-5 Federal Register The official journal where the notice of initi... [(a)] 0.904537 0.90 0.95 0.85 0.95 [The transformed sentence maintains the core m... Names -0.004537 0.954633
48 § 275.0-2 3 The Secretary of the Commission (Secretary) wi... [(a)(2)] 0.897325 0.90 0.85 0.85 0.95 [The transformed sentence maintains the core m... Operative_Rules 0.002675 0.973246

550 rows Ɨ 13 columns

Correlation analysis using Spearman, Kendall, and Pearson

Kendall

InĀ [55]:
# Compute Kendall's Tau correlation to assess monotonicity
kendall_correlation, p_value_kendall = kendalltau(df_agree['similarity_score'], df_agree['semscore'])

kendall_correlation, p_value_kendall
Out[55]:
(0.2616709607635915, 2.5339459779402854e-15)

Spearman

InĀ [56]:
# Check if the relationship between similarity_score and semscore is monotonic
# Compute Spearman's rank correlation to assess monotonicity
spearman_correlation, p_value = spearmanr(df_agree['similarity_score'], df_agree['semscore'])

spearman_correlation, p_value
Out[56]:
(0.32938238960280797, 2.200917948508083e-15)

Pearson

InĀ [57]:
# Check if the relationship between similarity_score and semscore is monotonic
# Compute Spearman's rank correlation to assess monotonicity
pearsonr_correlation, p_value = pearsonr(df_agree['similarity_score'], df_agree['semscore'])

pearsonr_correlation, p_value
Out[57]:
(0.2573123366775016, 9.114258452255619e-10)
InĀ [58]:
# Calculate the correlation between similarity_score and semscore
correlation = df_agree['similarity_score'].corr(df_agree['semscore'])

correlation
Out[58]:
0.2573123366775016
InĀ [59]:
# Dados
x = df_agree['similarity_score']
y = df_agree['semscore']

# Criar o grÔfico de dispersão
plt.figure(figsize=(8, 6))  # Tamanho do grƔfico
plt.scatter(x, y, alpha=0.5, color="blue", edgecolor="k", label="Data points")

# Adicionar linha de tendĆŖncia linear
slope, intercept, _, _, _ = linregress(x, y)
x_line = np.linspace(min(x), max(x), 100)
y_line = slope * x_line + intercept
plt.plot(x_line, y_line, color="red", linewidth=2, label="Linear Trend Line")

# Melhorar a grid com intervalos mais detalhados
plt.grid(visible=True, which='both', linestyle='--', linewidth=0.5)

# Adicionar título e rótulos
plt.title('Scatterplot: Similarity Score vs SemScore with Trendline')
plt.xlabel('Similarity Score')
plt.ylabel('SemScore')
plt.legend()  # Mostrar legenda

# Mostrar o grƔfico
plt.tight_layout()
plt.show()
No description has been provided for this image

A correlation of -0.107 indicates a weak negative linear relationship between the variables, suggesting that as one variable slightly increases, the other tends to decrease marginally. However, the relationship is negligible, indicating little to no linear association. This weak correlation implies that changes in one variable do not reliably predict changes in the other. Furthermore, the low magnitude does not preclude the possibility of a non-linear relationship, which would require alternative methods of analysis for detection.

Prompt analysis¶

Analyze number of tokens from prompts and documents from last checkpoint using gpt-4o as a reference model.

According to OpenAI | models, the maximum number of tokens (context length) for gpt-4o is 128k.

The cost to use gpt-4o is 2.50 USD per 1m tokens in 2024-10-31. Source: OpenAI | pricing.

Extract elapse times and completions from all sessions.

InĀ [60]:
managers, file_info_list = get_all_checkpoints(config["DEFAULT_CHECKPOINT_DIR"])
2024-12-15 01:44:39 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-1.json
2024-12-15 01:44:39 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-10.json
2024-12-15 01:44:39 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-2.json
2024-12-15 01:44:39 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-3.json
2024-12-15 01:44:39 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-4.json
2024-12-15 01:44:39 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-5.json
2024-12-15 01:44:39 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-6.json
2024-12-15 01:44:39 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-7.json
2024-12-15 01:44:39 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-8.json
2024-12-15 01:44:39 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-9.json
InĀ [61]:
tokens_eval = {"doc_type": [], "elapsed_times": [], "completions": [], "file_infos": []}

for manager, file_info in zip(managers, file_info_list):
    # Process documents
    for key in manager.model_dump()["documents"].keys():
        if key[1].startswith("llm_"):
            doc = manager.retrieve_document(key[0], key[1])
            logger.info(f"Processing: {key[0]}, {key[1]}")
            elapsed_times = doc.elapsed_times
            logger.debug(f"Elapsed time: {elapsed_times}")
            completions = doc.completions
            logger.debug(f"Completions: {completions}")
            tokens_eval["doc_type"].append(key[1])
            tokens_eval["elapsed_times"].append(elapsed_times)
            tokens_eval["completions"].append(completions)
            tokens_eval["file_infos"].append(file_info)
logger.info(f"Executions for evaluation: {len(tokens_eval['doc_type'])}")
2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P1, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P2, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P1, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P2, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P1, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P2, llm_response
2024-12-15 01:44:39 - INFO - Processing: classify_P1, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: transform_Operative_Rules, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: transform_Fact_Types, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: transform_Terms, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: transform_Names, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Operative_Rules, llm_validation
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Fact_Types, llm_validation
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Terms, llm_validation
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Names, llm_validation
2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P1, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P2, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P1, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P2, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P1, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P2, llm_response
2024-12-15 01:44:39 - INFO - Processing: classify_P1, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: transform_Operative_Rules, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: transform_Fact_Types, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: transform_Terms, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: transform_Names, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Operative_Rules, llm_validation
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Fact_Types, llm_validation
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Terms, llm_validation
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Names, llm_validation
2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P1, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P2, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P1, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P2, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P1, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P2, llm_response
2024-12-15 01:44:39 - INFO - Processing: classify_P1, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: transform_Operative_Rules, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: transform_Fact_Types, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: transform_Terms, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: transform_Names, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Operative_Rules, llm_validation
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Fact_Types, llm_validation
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Terms, llm_validation
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Names, llm_validation
2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P1, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P2, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P1, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P2, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P1, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P2, llm_response
2024-12-15 01:44:39 - INFO - Processing: classify_P1, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: transform_Operative_Rules, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: transform_Fact_Types, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: transform_Terms, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: transform_Names, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Operative_Rules, llm_validation
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Fact_Types, llm_validation
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Terms, llm_validation
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Names, llm_validation
2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P1, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P2, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P1, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P2, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P1, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P2, llm_response
2024-12-15 01:44:39 - INFO - Processing: classify_P1, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: transform_Operative_Rules, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: transform_Fact_Types, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: transform_Terms, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: transform_Names, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Operative_Rules, llm_validation
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Fact_Types, llm_validation
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Terms, llm_validation
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Names, llm_validation
2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P1, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P2, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P1, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P2, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P1, llm_response
2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P2, llm_response
2024-12-15 01:44:39 - INFO - Processing: classify_P1, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification
2024-12-15 01:44:39 - INFO - Processing: transform_Operative_Rules, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: transform_Fact_Types, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: transform_Terms, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: transform_Names, llm_response_transform
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Operative_Rules, llm_validation
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Fact_Types, llm_validation
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Terms, llm_validation
2024-12-15 01:44:39 - INFO - Processing: validation_judge_Names, llm_validation
2024-12-15 01:44:40 - INFO - Processing: § 275.0-2_P1, llm_response
2024-12-15 01:44:40 - INFO - Processing: § 275.0-2_P2, llm_response
2024-12-15 01:44:40 - INFO - Processing: § 275.0-5_P1, llm_response
2024-12-15 01:44:40 - INFO - Processing: § 275.0-5_P2, llm_response
2024-12-15 01:44:40 - INFO - Processing: § 275.0-7_P1, llm_response
2024-12-15 01:44:40 - INFO - Processing: § 275.0-7_P2, llm_response
2024-12-15 01:44:40 - INFO - Processing: classify_P1, llm_response_classification
2024-12-15 01:44:40 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification
2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification
2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification
2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification
2024-12-15 01:44:40 - INFO - Processing: transform_Operative_Rules, llm_response_transform
2024-12-15 01:44:40 - INFO - Processing: transform_Fact_Types, llm_response_transform
2024-12-15 01:44:40 - INFO - Processing: transform_Terms, llm_response_transform
2024-12-15 01:44:40 - INFO - Processing: transform_Names, llm_response_transform
2024-12-15 01:44:40 - INFO - Processing: validation_judge_Operative_Rules, llm_validation
2024-12-15 01:44:40 - INFO - Processing: validation_judge_Fact_Types, llm_validation
2024-12-15 01:44:40 - INFO - Processing: validation_judge_Terms, llm_validation
2024-12-15 01:44:40 - INFO - Processing: validation_judge_Names, llm_validation
2024-12-15 01:44:40 - INFO - Processing: § 275.0-2_P1, llm_response
2024-12-15 01:44:40 - INFO - Processing: § 275.0-2_P2, llm_response
2024-12-15 01:44:40 - INFO - Processing: § 275.0-5_P1, llm_response
2024-12-15 01:44:40 - INFO - Processing: § 275.0-5_P2, llm_response
2024-12-15 01:44:40 - INFO - Processing: § 275.0-7_P1, llm_response
2024-12-15 01:44:40 - INFO - Processing: § 275.0-7_P2, llm_response
2024-12-15 01:44:40 - INFO - Processing: classify_P1, llm_response_classification
2024-12-15 01:44:40 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification
2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification
2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification
2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification
2024-12-15 01:44:40 - INFO - Processing: transform_Operative_Rules, llm_response_transform
2024-12-15 01:44:40 - INFO - Processing: transform_Fact_Types, llm_response_transform
2024-12-15 01:44:40 - INFO - Processing: transform_Terms, llm_response_transform
2024-12-15 01:44:40 - INFO - Processing: transform_Names, llm_response_transform
2024-12-15 01:44:40 - INFO - Processing: validation_judge_Operative_Rules, llm_validation
2024-12-15 01:44:40 - INFO - Processing: validation_judge_Fact_Types, llm_validation
2024-12-15 01:44:40 - INFO - Processing: validation_judge_Terms, llm_validation
2024-12-15 01:44:40 - INFO - Processing: validation_judge_Names, llm_validation
2024-12-15 01:44:40 - INFO - Processing: § 275.0-2_P1, llm_response
2024-12-15 01:44:40 - INFO - Processing: § 275.0-2_P2, llm_response
2024-12-15 01:44:40 - INFO - Processing: § 275.0-5_P1, llm_response
2024-12-15 01:44:40 - INFO - Processing: § 275.0-5_P2, llm_response
2024-12-15 01:44:40 - INFO - Processing: § 275.0-7_P1, llm_response
2024-12-15 01:44:40 - INFO - Processing: § 275.0-7_P2, llm_response
2024-12-15 01:44:40 - INFO - Processing: classify_P1, llm_response_classification
2024-12-15 01:44:40 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification
2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification
2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification
2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification
2024-12-15 01:44:40 - INFO - Processing: transform_Operative_Rules, llm_response_transform
2024-12-15 01:44:40 - INFO - Processing: transform_Fact_Types, llm_response_transform
2024-12-15 01:44:40 - INFO - Processing: transform_Terms, llm_response_transform
2024-12-15 01:44:40 - INFO - Processing: transform_Names, llm_response_transform
2024-12-15 01:44:40 - INFO - Processing: validation_judge_Operative_Rules, llm_validation
2024-12-15 01:44:40 - INFO - Processing: validation_judge_Fact_Types, llm_validation
2024-12-15 01:44:40 - INFO - Processing: validation_judge_Terms, llm_validation
2024-12-15 01:44:40 - INFO - Processing: validation_judge_Names, llm_validation
2024-12-15 01:44:40 - INFO - Processing: § 275.0-2_P1, llm_response
2024-12-15 01:44:40 - INFO - Processing: § 275.0-2_P2, llm_response
2024-12-15 01:44:40 - INFO - Processing: § 275.0-5_P1, llm_response
2024-12-15 01:44:40 - INFO - Processing: § 275.0-5_P2, llm_response
2024-12-15 01:44:40 - INFO - Processing: § 275.0-7_P1, llm_response
2024-12-15 01:44:40 - INFO - Processing: § 275.0-7_P2, llm_response
2024-12-15 01:44:40 - INFO - Processing: classify_P1, llm_response_classification
2024-12-15 01:44:40 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification
2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification
2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification
2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification
2024-12-15 01:44:40 - INFO - Processing: transform_Operative_Rules, llm_response_transform
2024-12-15 01:44:40 - INFO - Processing: transform_Fact_Types, llm_response_transform
2024-12-15 01:44:40 - INFO - Processing: transform_Terms, llm_response_transform
2024-12-15 01:44:40 - INFO - Processing: transform_Names, llm_response_transform
2024-12-15 01:44:40 - INFO - Processing: validation_judge_Operative_Rules, llm_validation
2024-12-15 01:44:40 - INFO - Processing: validation_judge_Fact_Types, llm_validation
2024-12-15 01:44:40 - INFO - Processing: validation_judge_Terms, llm_validation
2024-12-15 01:44:40 - INFO - Processing: validation_judge_Names, llm_validation
2024-12-15 01:44:40 - INFO - Executions for evaluation: 190

Evaluate

InĀ [62]:
# Constants
reference_models = config["REFERENCE_MODELS"]["MAX_CONTEXT_LENGTH"]
price_per_million_tokens = config["REFERENCE_MODELS"]["PRICE_PER_MILLION_TOKENS"]

# Initialize an empty list to store the raw data
raw_data = []

# Assuming tokens_eval is already defined and contains the necessary data
for doc_type, elapsed_times, completions, file_info in zip(
    tokens_eval["doc_type"],
    tokens_eval["elapsed_times"],
    tokens_eval["completions"],
    tokens_eval["file_infos"],
):
    for elapsed_time, completion in zip(elapsed_times, completions):
        raw_data.append(
            (
                file_info["filename"],
                doc_type,
                elapsed_time,
                completion["usage"],
                completion["created"],
                completion["model"],
            )
        )

prompt_data_df = prompt_analysis(raw_data, config["DEFAULT_OUTPUT_DIR"])
Overall Statistics:
 Total Tokens  Number of Samples  Average Elapsed Time (s)  Estimated Cost (USD)  Average Percentage of Context Length (%) Min Created Timestamp Max Created Timestamp                      origin              run_at
      5472633               1210                  4.055949             13.681582                                  3.533467   2024-11-30 00:08:20   2024-12-15 03:44:41 documents-2024-12-08-9.json 2024-12-15 01:44:40

Statistics by Sample Type (doc_type):
                   doc_type  total_tokens  num_samples  average_elapsed_time  average_tokens  estimated_cost  average_percentage_context_length                    filename              run_at
               llm_response        272850           60             28.951583     4547.500000        0.682125                           3.552734 documents-2024-12-08-9.json 2024-12-15 01:44:40
llm_response_classification        370390           50              8.280727     7407.800000        0.925975                           5.787344 documents-2024-12-08-9.json 2024-12-15 01:44:40
     llm_response_transform       2480340          550              2.239824     4509.709091        6.200850                           3.523210 documents-2024-12-08-9.json 2024-12-15 01:44:40
             llm_validation       2349053          550              2.772115     4271.005455        5.872632                           3.336723 documents-2024-12-08-9.json 2024-12-15 01:44:40

Statistics by Model:
            model  total_tokens  num_samples  average_elapsed_time  average_tokens  average_percentage_context_length                    filename              run_at  estimated_cost      cost
gpt-4o-2024-08-06       5472633         1210              4.055949      4522.83719                           3.533467 documents-2024-12-08-9.json 2024-12-15 01:44:40       13.681583 13.681583

Additional Statistics:
 Average Completion Tokens  Average Prompt Tokens  Average Total Tokens per Sample  Total Elapsed Time (s)  Average Tokens per Second                      origin              run_at
                314.194215            4208.642975                       4522.83719             4907.698278                1802.337863 documents-2024-12-08-9.json 2024-12-15 01:44:40
InĀ [63]:
prompt_data_df.describe()
Out[63]:
elapsed_time created completion_tokens prompt_tokens total_tokens reference_context_length price_per_million_tokens tokens_per_second
count 1210.000000 1210 1210.000000 1210.000000 1210.00000 1210.0 1210.0 1210.000000
mean 4.055949 2024-12-11 08:39:27.552892416 314.194215 4208.642975 4522.83719 128000.0 2.5 1802.337863
min 1.502028 2024-11-30 00:08:20 131.000000 1357.000000 1501.00000 128000.0 2.5 127.116341
25% 2.005183 2024-12-09 01:54:09 156.000000 2337.000000 4597.00000 128000.0 2.5 1056.751444
50% 2.466322 2024-12-09 01:55:39 173.000000 4722.000000 5127.00000 128000.0 2.5 1891.422901
75% 2.913875 2024-12-15 03:16:51.750000128 200.000000 5074.000000 5257.00000 128000.0 2.5 2480.162415
max 43.604876 2024-12-15 03:44:41 4517.000000 8120.000000 8590.00000 128000.0 2.5 3466.646635
std 6.358610 NaN 588.970664 1557.668782 1508.34103 0.0 0.0 857.387664
InĀ [64]:
# Running the analysis
stats = summary_statistics(prompt_data_df)
token_usage_analysis(prompt_data_df)
time_efficiency_analysis(prompt_data_df)
cost_analysis(prompt_data_df)
temporal_analysis(prompt_data_df)
group_performance_analysis(prompt_data_df)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
2024-12-15 01:44:41 - INFO - Total cost: $13.68
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
InĀ [65]:
stats
Out[65]:
elapsed_time created completion_tokens prompt_tokens total_tokens reference_context_length price_per_million_tokens tokens_per_second
count 1210.000000 1210 1210.000000 1210.000000 1210.00000 1210.0 1210.0 1210.000000
mean 4.055949 2024-12-11 08:39:27.552892416 314.194215 4208.642975 4522.83719 128000.0 2.5 1802.337863
min 1.502028 2024-11-30 00:08:20 131.000000 1357.000000 1501.00000 128000.0 2.5 127.116341
25% 2.005183 2024-12-09 01:54:09 156.000000 2337.000000 4597.00000 128000.0 2.5 1056.751444
50% 2.466322 2024-12-09 01:55:39 173.000000 4722.000000 5127.00000 128000.0 2.5 1891.422901
75% 2.913875 2024-12-15 03:16:51.750000128 200.000000 5074.000000 5257.00000 128000.0 2.5 2480.162415
max 43.604876 2024-12-15 03:44:41 4517.000000 8120.000000 8590.00000 128000.0 2.5 3466.646635
std 6.358610 NaN 588.970664 1557.668782 1508.34103 0.0 0.0 857.387664